1 Setup

1.1 Packages

library(tidyverse)
library(cowplot)
library(GGally)
library(heatmaply)
library(sva)
library(limma)
library(broom)
library(ggridges)

1.2 Data

1.2.1 Metabolite Abundances

# Cells #
vpa.cell.neg.raw <- read_csv("./data/abundances/vpa_1and2_cells_target_negmode.csv")
vpa.cell.pos.raw <- read_csv("./data/abundances/vpa_1and2_cells_target_posmode.csv")
# Media #
vpa.med.neg.raw <- read_csv("./data/abundances/vpa_1and2_media_target_negmode.csv")
vpa.med.pos.raw <- read_csv("./data/abundances/vpa_1and2_media_target_posmode.csv")

1.2.2 Compound Information

# Cells #
vpa.cell.neg.compound.info <- read_csv("./data/compound_info/vpa_1and2_cells_target_negmode_cmpnd_info.csv")
vpa.cell.pos.compound.info <- read_csv("./data/compound_info/vpa_1and2_cells_target_posmode_cmpnd_info.csv")
# Media #
vpa.med.neg.compound.info <- read_csv("./data/compound_info/vpa_1and2_media_target_negmode_cmpnd_info.csv")
vpa.med.pos.compound.info <- read_csv("./data/compound_info/vpa_1and2_media_target_posmode_cmpnd_info.csv")
# Kegg/other ID reference #
cmpnd.id.db <- read_csv("./data/other/anp_db_compound_kegg.csv")

1.3 Functions

MissingPerSamplePlot <- function(raw.data, start.string) {
  # Counts the number of missing/NA values per sample and
  # percent compounds missing out of total number of compounds per sample
  # Then passes the result into a vertical bar plot, where each 
  # bar represents a single sample and the size of the bar 
  # is the % of compounds missing
  
  counted.na <- raw.data %>%
  select(starts_with(start.string)) %>% 
  mutate(
    count.na = apply(., 1, function(x) sum(is.na(x))),
    percent.na = (count.na / ncol(raw.data %>% select(starts_with(start.string)))) * 100
    ) %>%
  dplyr::select(count.na, percent.na) %>%
  bind_cols(
    raw.data %>% 
      select(Samples, Group)
      ) %>% 
  arrange(percent.na) %>% 
  mutate(f.order = factor(Samples, levels = Samples))
counted.na %>% 
  ggplot(aes(x = f.order, y = percent.na, fill = Group)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = 20, color = "gray", size = 1, alpha = 0.8) +
  coord_flip()+
  xlab("Samples") +
  ylab("Percent missing values in sample") +
  theme(axis.text.y = element_text(size = 6)) 
}
MissingPerCompound <- function(raw.data, start.string){
  # Function to count in how many experimental samples each compound is missing
  # Also calculates the % missing:
  # (# NA per compound across all experimental samples) * 100 / (tot num of samples)
  
  raw.data %>% 
  filter(Group == "sample") %>% 
  select(Samples, starts_with(start.string)) %>% 
  gather(key = "Compound", value = "Abundance", -Samples) %>% 
  group_by(Compound) %>% 
  summarise(
    na_count = sum(is.na(Abundance)),
    n_samples = n(),
    percent_na = (na_count * 100 / n_samples)
    ) %>% 
  filter(na_count > 0) %>% 
  arrange(desc(percent_na))
}
ReplaceNAwMinLogTransformSingle <- function(raw.dataframe, start.prefix) {
  # Function to replace any NAs in each column with the minimum for that column, 
  # separately for each sample type.
  # NA in negative control samples are replaced by 2.
  # Then data is log2 transformed
  
  smpls <- raw.dataframe %>%
    filter(Group == "sample") %>% 
    dplyr::select(starts_with(start.prefix))
  smpls.min <- lapply(smpls, min, na.rm = TRUE)
  # replace the missing values in the real samples with the minimum of the samples
  # then take the log
  smpls.noNA <- raw.dataframe  %>%
    filter(Group == "sample") %>% 
    dplyr::select(Samples:Experiment) %>%
    bind_cols(
      smpls %>%
        replace_na(replace = smpls.min) %>% 
        log2()
      )
  # QC
  QC <- raw.dataframe %>%
    filter(Group == "QC") %>% 
    dplyr::select(starts_with(start.prefix))
  QC.min <- lapply(QC, min, na.rm = TRUE)
  # replace the missing values in the QC with the minimum of the QC
  # then take the log
  QC.noNA <- raw.dataframe  %>%
    filter(Group == "QC") %>% 
    dplyr::select(Samples:Experiment) %>%
    bind_cols(
      QC %>%
        replace_na(replace = QC.min) %>% 
        log2()
      )
  # replace the missing values in solv and empty samples with 2 - for PCA analysis
  # then take the log
  other.min <- setNames(
    as.list(
      rep(2, ncol(
        raw.dataframe %>% 
          dplyr::select(starts_with(start.prefix))))
      ),
    colnames(raw.dataframe %>% dplyr::select(starts_with(start.prefix)))
                )
  other.num.log <- raw.dataframe %>%
    filter(Group != "sample" & Group != "QC") %>% 
    dplyr::select(Samples:Experiment) %>%
    bind_cols(
      raw.dataframe %>% 
        filter(Group != "sample" & Group != "QC") %>% 
        dplyr::select(starts_with(start.prefix)) %>%
        replace_na(replace = other.min) %>% 
        log2()
      )
  # combine them together back into one data frame
  all.noNA <- smpls.noNA %>% 
    bind_rows(QC.noNA) %>% 
    bind_rows(other.num.log)
}
HeatmapPrepAlt <- function(raw.data, start.prefix){
  # function for preparing dara for heatmap viz
  x <- raw.data %>% 
    select(starts_with(start.prefix)) %>% 
    scale(center = TRUE, scale = TRUE) %>% 
    as.matrix() 
  row.names(x) <- raw.data$Samples
  return(x)
}

2 Data Exploration

2.1 Mass vs Retention Time

Q: What are the distributions of compound masses and retention times?

# bind all 4 compound info df into 1 
full.vpa.cmpnd <- vpa.cell.neg.compound.info %>% 
  mutate(Set = "Cells / Neg") %>% 
  bind_rows(vpa.cell.pos.compound.info %>% mutate(Set = "Cells / Pos")) %>% 
  bind_rows(vpa.med.neg.compound.info %>% mutate(Set = "Media / Neg")) %>%
  bind_rows(vpa.med.pos.compound.info %>% mutate(Set = "Media / Pos"))
full.vpa.cmpnd %>% 
  ggplot(aes(x = rt, y = mass)) +
  geom_point(size = 3, alpha = 0.3) +
  xlab("Retention Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA Only Exp") +
  ylim(0, 1100)

full.vpa.cmpnd %>% 
  ggplot(aes(x = rt, y = mass, color = Set)) +
  geom_point(size = 3, alpha = 0.5) +
  xlab("Retention Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA Only Exp") +
  facet_wrap(~ Set) +
  ylim(0, 1100)

The compounds included in the Cells / Neg mode and Cells / Pos mode datasets loook to have a good spread of masses and retention times within the expected RT window of 0 to 29min and mass window of 50 to 1000Da. A smaller number of compounds were detected in the media samples, and the larger molecules that were detected in cells were not detected in media. This is not a surprising result, media in general is expected to have a lower number of compounds than the cells themselves, as cells produce a wide variety of molecules that are not exported to the media.

Q: Which compounds were found in one or more of the data types?

### vpa cell join ###
vpa.cell.cmpnd.join <- vpa.cell.neg.compound.info %>% 
  inner_join(vpa.cell.pos.compound.info, by = "cas_id", suffix = c(".c.n", ".c.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / cells 
print(vpa.cell.cmpnd.join$compound_full.c.n)
  [1] "Glycine"                                    
  [2] "Acetoin"                                    
  [3] "Alanine"                                    
  [4] "Sarcosine"                                  
  [5] "Beta-Alanine"                               
  [6] "2-Aminobutyric Acid"                        
  [7] "BAIBA"                                      
  [8] "Serine"                                     
  [9] "Hypotaurine"                                
 [10] "Uracil"                                     
 [11] "Creatinine"                                 
 [12] "5,6-Dihydrouracil"                          
 [13] "Proline"                                    
 [14] "Valine"                                     
 [15] "Threonine"                                  
 [16] "Taurine"                                    
 [17] "4-Oxoproline"                               
 [18] "Ketoleucine"                                
 [19] "trans-4-Hydroxyproline"                     
 [20] "Creatine"                                   
 [21] "Isoleucine"                                 
 [22] "Leucine"                                    
 [23] "Asparagine"                                 
 [24] "5-Hydroxyhexanoic Acid"                     
 [25] "Aspartic Acid"                              
 [26] "Adenine"                                    
 [27] "Hypoxanthine"                               
 [28] "O-Phosphoethanolamine"                      
 [29] "Glutamine"                                  
 [30] "Lysine"                                     
 [31] "N-Acetylserine"                             
 [32] "Glutamic Acid"                              
 [33] "Methionine"                                 
 [34] "D-Ribose"                                   
 [35] "Guanine"                                    
 [36] "Xanthine"                                   
 [37] "3-Sulfinoalanine"                           
 [38] "Histidine"                                  
 [39] "Orotic Acid"                                
 [40] "Allantoin"                                  
 [41] "Fucose"                                     
 [42] "Phenylalanine"                              
 [43] "Pyridoxal"                                  
 [44] "Cysteic Acid"                               
 [45] "Pyridoxine"                                 
 [46] "3-Methylhistidine"                          
 [47] "G3P"                                        
 [48] "Glycerol 1-Phosphate / Glycerol 3-Phosphate"
 [49] "N-Carbamoyl-L-aspartic Acid"                
 [50] "Tyrosine"                                   
 [51] "D-Galactitol"                               
 [52] "Phosphocholine"                             
 [53] "N-alpha-Acetylglutamine"                    
 [54] "Tryptophan"                                 
 [55] "Phosphocreatine"                            
 [56] "Glycerylphosphorylethanolamine"             
 [57] "O-Succinylhomoserine"                       
 [58] "Pantothenic Acid"                           
 [59] "GlcNAc"                                     
 [60] "Cystathionine"                              
 [61] "Deoxycytidine"                              
 [62] "Cytidine"                                   
 [63] "Uridine"                                    
 [64] "Palmitoleic Acid"                           
 [65] "Glycerol 1-phosphoserine"                   
 [66] "D-Glucose 6-phosphate"                      
 [67] "Thiamine"                                   
 [68] "Inosine"                                    
 [69] "Myoinositol-methyl-phosphate"               
 [70] "Linoleic Acid"                              
 [71] "Guanosine"                                  
 [72] "Xanthosine"                                 
 [73] "L-Arginosuccinic Acid"                      
 [74] "Ricinoleic Acid"                            
 [75] "Glutathione"                                
 [76] "11Z,14Z-Eicosadienoic Acid (20:2 n-6)"      
 [77] "CMP"                                        
 [78] "UMP"                                        
 [79] "3-Phosphoglyceroinositol"                   
 [80] "AMP"                                        
 [81] "GMP"                                        
 [82] "SAH"                                        
 [83] "CDP"                                        
 [84] "ADP"                                        
 [85] "GDP"                                        
 [86] "LysoPE(18:2)"                               
 [87] "LysoPE(18:0)"                               
 [88] "dTTP"                                       
 [89] "CTP"                                        
 [90] "UTP"                                        
 [91] "CDP-Choline"                                
 [92] "dATP"                                       
 [93] "ATP"                                        
 [94] "GTP"                                        
 [95] "ADP-Ribose"                                 
 [96] "UDP-Galactose"                              
 [97] "UDP-Glucose"                                
 [98] "UDP-Glucuronic acid"                        
 [99] "ADP-Glucose"                                
[100] "GDP-Glucose"                                
[101] "UDP-GalNAc"                                 
[102] "UDP-GlcNAc"                                 
[103] "GSSG"                                       
[104] "NAD"                                        
[105] "NADH"                                       
[106] "NADP"                                       
[107] "PC(36:4)"                                   
[108] "FAD"                                        
[109] "Acetyl-CoA"                                 
# percent of cell / neg compounds found in cell / pos 
round(nrow(vpa.cell.cmpnd.join) * 100 / nrow(vpa.cell.neg.compound.info), 1)
[1] 51.7
# percent of cell / neg compounds found in cell / pos 
round(nrow(vpa.cell.cmpnd.join) * 100 / nrow(vpa.cell.pos.compound.info), 1)
[1] 61.9
### vpa media join ###
vpa.med.cmpnd.join <- vpa.med.neg.compound.info %>% 
  inner_join(vpa.med.pos.compound.info, by = "cas_id", suffix = c(".m.n", ".m.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / media
print(vpa.med.cmpnd.join$compound_full.m.n)
 [1] "Alanine"                  "Serine"                  
 [3] "Uracil"                   "Valine"                  
 [5] "Threonine"                "Taurine"                 
 [7] "Thymine"                  "4-Oxoproline"            
 [9] "Isoleucine"               "Leucine"                 
[11] "Glutamine"                "Methionine"              
[13] "Xanthine"                 "Histidine"               
[15] "Allantoin"                "Phenylalanine"           
[17] "Uric Acid"                "D-Glucose"               
[19] "Tyrosine"                 "D-Galactitol"            
[21] "Cysteine-S-sulfonic Acid" "Tryptophan"              
[23] "Pantothenic Acid"         "Cytidine"                
[25] "Uridine"                  "Inosine"                 
[27] "Guanosine"                "Folic Acid"              
# percent of media / neg compounds found in media / pos
round(nrow(vpa.med.cmpnd.join) * 100 / nrow(vpa.med.neg.compound.info), 1)
[1] 51.9
# percent of media / pos compounds found in media / neg
round(nrow(vpa.med.cmpnd.join) * 100 / nrow(vpa.med.pos.compound.info), 1)
[1] 50
### vpa all modes join ###
vpa.all.cmpnd.join <- vpa.cell.cmpnd.join %>% 
  inner_join(vpa.med.cmpnd.join, by = "cas_id") %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
nrow(vpa.all.cmpnd.join)
[1] 23
# check for any mass issues between all 4 modes 
vpa.all.cmpnd.join %>% 
  select(contains("mass")) %>% 
  ggpairs()

# any rt inconsistencies?
vpa.all.cmpnd.join %>% 
  select(starts_with("rt")) %>% 
  ggpairs()

2.2 Missing Values

Q: Do any of the samples have greater than 20% missing (NA) compound abundances, out of all of the features in the dataset?

MissingPerSamplePlot(vpa.cell.neg.raw, "VPAnC") +
  ggtitle("Missing Per Sample\nVPA-only / Cells / Neg Mode")

MissingPerSamplePlot(vpa.cell.pos.raw, "VPApC") +
  ggtitle("Missing Per Sample\nVPA-only / Cells / Pos Mode")

MissingPerSamplePlot(vpa.med.neg.raw, "VPAnM") +
  ggtitle("Missing Per Sample\nVPA-only / Media / Neg Mode")

MissingPerSamplePlot(vpa.med.pos.raw, "VPApM") +
  ggtitle("Missing Per Sample\nVPA-only / Media / Pos Mode")

A: No, sample files across both datasets have very few missing values. The green-colored bars, marked “sample” are the actual experimental samples. Whereas the “solv”, or “solvent”, and “empty” samples are negative control samples that are expected to have many missing values and/or low compound abundances. They will be used to narrow down the list of features later on in the analysis.

Q: Are any of the compounds more than 20% missing in the experimental sample group? If there are any, they will be excluded from analysis.

Note: The MissingPerCompound function considers only the Group == "sample" samples.

MissingPerCompound(vpa.cell.neg.raw, "VPAnC") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>
(vpa.cell.pos.cmpnd.to.excl<- MissingPerCompound(vpa.cell.pos.raw, "VPApC") %>% 
  filter(percent_na > 20))
# A tibble: 1 x 4
  Compound na_count n_samples percent_na
  <chr>       <int>     <int>      <dbl>
1 VPApC110        9        23       39.1
MissingPerCompound(vpa.med.neg.raw, "VPAnM") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>
(vpa.med.pos.cmpnd.to.excl <- MissingPerCompound(vpa.med.pos.raw, "VPApM") %>% 
  filter(percent_na > 20))
# A tibble: 1 x 4
  Compound na_count n_samples percent_na
  <chr>       <int>     <int>      <dbl>
1 VPApM34         6        24         25

A: No compounds need to be excluded from the Cell / Neg Mode and Media / Neg Mode datasets, but 1 compound each will be excluded from the Cell / Pos Mode and Media / Pos Mode datasets.

2.3 Negative Controls and Compound Elimination

2.3.1 Cells / Negative Mode

# get the mean abundance of each compound, grouped by solvent vs empty sample vs experimental sample
vpa.cell.neg.raw.grp.mean <- vpa.cell.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPAnC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
# plot the log2 density distribution of the means
vpa.cell.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only / Cells / Negative Mode\nGrouped by sample type")

# for plotting purposes, to make the plot neater
vpa.cell.neg.raw.grp.mean.order <- vpa.cell.neg.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vpa.cell.neg.raw %>% 
  select(Samples, Group, starts_with("VPAnC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  # overlay the group averages 
  geom_line(
    data = vpa.cell.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Cells / Negative Mode")

# compound mean by group only, ordered by increasing abundance in the experimental samples
vpa.cell.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA-only / Cells / Negative Mode")

vpa.cell.neg.raw.grp.diff <- vpa.cell.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vpa.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 50)

vpa.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

vpa.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-3, 13.5) +
  ylim(-3, 13.5) +
  # abline in pink
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  # lm line in green
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Cells / Neg Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv or empty)
vpa.cell.neg.cmpnd.to.incl <- vpa.cell.neg.raw.grp.diff %>% 
  filter((smpl_solv_diff > 2.5 & smpl_empty_diff > 2.5) | is.na(smpl_solv_diff) | is.na(smpl_empty_diff))
# how many compound were there in the raw negative mode dataset?
nrow(vpa.cell.neg.raw.grp.diff)
[1] 211
# how many compounds are retained for further analysis?
nrow(vpa.cell.neg.cmpnd.to.incl)
[1] 179

2.3.2 Cells / Positive Mode

vpa.cell.pos.raw.grp.mean <- vpa.cell.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPApC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vpa.cell.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only / Cells / Positive Mode\nGrouped by sample type")

vpa.cell.pos.raw.grp.mean.order <- vpa.cell.pos.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vpa.cell.pos.raw %>% 
  select(Samples, Group, starts_with("VPApC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  # overlay the group averages 
  geom_line(
    data = vpa.cell.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Cells / Positive Mode")

vpa.cell.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)")  +
  ggtitle("Profile Plot of compound means by sample type only\nVPA-only / Cells / Positive Mode")

vpa.cell.pos.raw.grp.diff <- vpa.cell.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vpa.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 50)

vpa.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

vpa.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-1, 15) +
  ylim(-1, 15) +
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Cells / Pos Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv or empty)
vpa.cell.pos.cmpnd.to.incl <- vpa.cell.pos.raw.grp.diff %>% 
  filter((smpl_solv_diff > 2.5 & smpl_empty_diff > 2.5) | is.na(smpl_solv_diff) | is.na(smpl_empty_diff)) %>% 
  filter(!(Compound %in% vpa.cell.pos.cmpnd.to.excl$Compound))
# how many compound were there in the raw dataset?
nrow(vpa.cell.pos.raw.grp.diff)
[1] 176
# how many compounds are retained for further analysis?
nrow(vpa.cell.pos.cmpnd.to.incl)
[1] 142

2.3.3 Media / Negative Mode

vpa.med.neg.raw.grp.mean <- vpa.med.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPAnM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vpa.med.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only / Media / Negative Mode\nGrouped by sample type")

vpa.med.neg.raw.grp.mean.order <- vpa.med.neg.raw.grp.mean %>% 
  filter(Group == "empty") %>% 
  arrange(Grp_mean_abun)
vpa.med.neg.raw %>% 
  select(Samples, Group, starts_with("VPAnM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1.5) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  # overlay the group averages 
  geom_line(
    data = vpa.med.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Media / Negative Mode")

vpa.med.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)")

vpa.med.neg.raw.grp.diff <- vpa.med.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vpa.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 25)

vpa.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

vpa.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-4, 4) +
  ylim(-1, 4) +
  # abline in pink
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  # lm line in green
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Media / Neg Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vpa.med.neg.cmpnd.to.incl <- vpa.med.neg.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
# how many compound were there in the raw dataset?
nrow(vpa.med.neg.raw.grp.diff)
[1] 54
# how many compounds are retained for further analysis?
nrow(vpa.med.neg.cmpnd.to.incl)
[1] 41

2.3.4 Media / Positive Mode

vpa.med.pos.raw.grp.mean <- vpa.med.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPApM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vpa.med.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only / Media / Positive Mode\nGrouped by sample type")

vpa.med.pos.raw.grp.mean.order <- vpa.med.pos.raw.grp.mean %>% 
  filter(Group == "empty") %>% 
  arrange(Grp_mean_abun)
vpa.med.pos.raw %>% 
  select(Samples, Group, starts_with("VPApM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1.5) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  # overlay the group averages 
  geom_line(
    data = vpa.med.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Media / Positive Mode")

vpa.med.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)")

vpa.med.pos.raw.grp.diff <- vpa.med.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vpa.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 25)

vpa.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

vpa.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-6, 2) +
  ylim(-1, 7) +
  # abline in pink
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  # lm line in green
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Media / pos Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vpa.med.pos.cmpnd.to.incl <- vpa.med.pos.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>% 
  filter(!(Compound %in% vpa.med.pos.cmpnd.to.excl$Compound))
# how many compound were there in the raw dataset?
nrow(vpa.med.pos.raw.grp.diff)
[1] 56
# how many compounds are retained for further analysis?
nrow(vpa.med.pos.cmpnd.to.incl)
[1] 36

3 Data Prep and Preliminary Analysis

3.1 Cleanup

# replace missing with minimum for each Group in each dataset and log2() transform the data:
# exclude compound that have a >20% NA count across samples
vpa.cell.neg.noNA <- vpa.cell.neg.raw %>% 
  select(Samples:Experiment, one_of(vpa.cell.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("VPAnC")
vpa.cell.pos.noNA <- vpa.cell.pos.raw %>% 
  select(Samples:Experiment, one_of(vpa.cell.pos.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("VPApC")
vpa.med.neg.noNA <- vpa.med.neg.raw %>% 
  select(Samples:Experiment, one_of(vpa.med.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("VPAnM")
vpa.med.pos.noNA <- vpa.med.pos.raw %>% 
  select(Samples:Experiment, one_of(vpa.med.pos.cmpnd.to.incl$Compound)) %>%  
  ReplaceNAwMinLogTransformSingle("VPApM")

3.2 Distribution plots

3.2.1 Cells / Negative Mode

vpa.cell.neg.noNA.gathered <- vpa.cell.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.cell.neg.noNA) == "VPAnC1"):ncol(vpa.cell.neg.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.cell.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Cells / Negative Mode")

# same data format, but as ridge plots
vpa.cell.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Cells / Negative Mode")

# experimental samples only
vpa.cell.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Negative Mode")

# overlay the distributions for another look
vpa.cell.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Negative Mode")

3.2.2 Cells / Positive Mode

vpa.cell.pos.noNA.gathered <- vpa.cell.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.cell.pos.noNA) == "VPApC1"):ncol(vpa.cell.pos.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.cell.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Cells / Positive Mode")

# same data format, but as ridge plots
vpa.cell.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Cells / Positive Mode")

# experimental samples only
vpa.cell.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Positive Mode")

# overlay the distributions for another look
vpa.cell.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Positive Mode")

3.2.3 Media / Negative Mode

vpa.med.neg.noNA.gathered <- vpa.med.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.med.neg.noNA) == "VPAnM1"):ncol(vpa.med.neg.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.med.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Media / Negative Mode")

# same data format, but as ridge plots
vpa.med.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Media / Negative Mode")

# experimental samples only
vpa.med.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Media / Negative Mode")

# overlay the distributions for another look
vpa.med.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Media / Negative Mode")

3.2.4 Media / Positive Mode

vpa.med.pos.noNA.gathered <- vpa.med.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.med.pos.noNA) == "VPApM1"):ncol(vpa.med.pos.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.med.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Media / Positive Mode")

# same data format, but as ridge plots
vpa.med.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Media / Positive Mode")

# experimental samples only
vpa.med.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Media / Positive Mode")

# overlay the distributions for another look
vpa.med.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Media / Positive Mode")

3.3 Principal Component Analysis

Some plots to understand the relationship between the samples, QC samples, solvent, and empty samples in some cases.

3.3.1 Cells / Negative Mode

### PCA on all Samples ###
vpa.cell.neg.full.pca <- vpa.cell.neg.noNA %>% 
  select(starts_with("VPAnC")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (vpa.cell.neg.full.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Cells / Negative Mode",
  type = "b"
  )

vpa.cell.neg.full.pca.x <- as.data.frame(vpa.cell.neg.full.pca$x)
row.names(vpa.cell.neg.full.pca.x) <- vpa.cell.neg.noNA$Samples
vpa.cell.neg.full.pca.x <- vpa.cell.neg.full.pca.x %>% 
  bind_cols(vpa.cell.neg.noNA %>% select(Group:Experiment))
vpa.cell.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (90.5% Var)") +
  ylab("PC2 (3.1%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Cells / Negative Mode")

### Samples and QC ###
vpa.cell.neg.smpl.QC.pca <- vpa.cell.neg.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Cells / Negative Mode",
  type = "b"
  )

vpa.cell.neg.smpl.QC.pca.x <- as.data.frame(vpa.cell.neg.smpl.QC.pca$x)
vpa.cell.neg.smpl.QC.pca.x <- vpa.cell.neg.smpl.QC.pca.x %>% 
  bind_cols(
    vpa.cell.neg.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.cell.neg.smpl.QC.pca.x) <- vpa.cell.neg.smpl.QC.pca.x$Samples
vpa.cell.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (35.1% Var)") +
  ylab("PC2 (19.7%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Negative Mode")

vpa.cell.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (16.1% Var)") +
  ylab("PC4 (6.4%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Negative Mode")

### Experimental Samples Only ###
vpa.cell.neg.smpl.pca <- vpa.cell.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.neg.smpl.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Cells / Negative Mode",
  type = "b"
  )

vpa.cell.neg.smpl.pca.x <- as.data.frame(vpa.cell.neg.smpl.pca$x)
vpa.cell.neg.smpl.pca.x <- vpa.cell.neg.smpl.pca.x %>% 
  bind_cols(
    vpa.cell.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.cell.neg.smpl.pca.x) <- vpa.cell.neg.smpl.pca.x$Samples
vpa.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (34.3% Var)") +
  ylab("PC2 (23.8%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Negative Mode")

vpa.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (16.6% Var)") +
  ylab("PC4 (7.1%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Negative Mode")

3.3.2 Cells / Positive Mode

### PCA on all Samples ###
vpa.cell.pos.full.pca <- vpa.cell.pos.noNA %>% 
  select(starts_with("VPApC")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (vpa.cell.pos.full.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Cells / Positive Mode",
  type = "b"
  )

vpa.cell.pos.full.pca.x <- as.data.frame(vpa.cell.pos.full.pca$x)
row.names(vpa.cell.pos.full.pca.x) <- vpa.cell.pos.noNA$Samples
vpa.cell.pos.full.pca.x <- vpa.cell.pos.full.pca.x %>% 
  bind_cols(vpa.cell.pos.noNA %>% select(Group:Experiment))
vpa.cell.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (90.0% Var)") +
  ylab("PC2 (4.2%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Cells / Positive Mode")

### Samples and QC ###
vpa.cell.pos.smpl.QC.pca <- vpa.cell.pos.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Cells / Positive Mode",
  type = "b"
  )

vpa.cell.pos.smpl.QC.pca.x <- as.data.frame(vpa.cell.pos.smpl.QC.pca$x)
vpa.cell.pos.smpl.QC.pca.x <- vpa.cell.pos.smpl.QC.pca.x %>% 
  bind_cols(
    vpa.cell.pos.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.cell.pos.smpl.QC.pca.x) <- vpa.cell.pos.smpl.QC.pca.x$Samples
vpa.cell.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (55.0% Var)") +
  ylab("PC2 (15.0%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Positive Mode")

vpa.cell.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.4% Var)") +
  ylab("PC4 (5.0%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Positive Mode")

### Experimental Samples Only ###
vpa.cell.pos.smpl.pca <- vpa.cell.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.pos.smpl.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Cells / Positive Mode",
  type = "b"
  )

vpa.cell.pos.smpl.pca.x <- as.data.frame(vpa.cell.pos.smpl.pca$x)
vpa.cell.pos.smpl.pca.x <- vpa.cell.pos.smpl.pca.x %>% 
  bind_cols(
    vpa.cell.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.cell.pos.smpl.pca.x) <- vpa.cell.pos.smpl.pca.x$Samples
vpa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (57.5% Var)") +
  ylab("PC2 (19.6%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Positive Mode")

vpa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (7.2% Var)") +
  ylab("PC4 (4.7%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Positive Mode")

3.3.3 Media / Negative Mode

### PCA on all Samples ###
vpa.med.neg.full.pca <- vpa.med.neg.noNA %>% 
  select(starts_with("VPAnM")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (vpa.med.neg.full.pca$sdev ^ 2) * 100 / sum(vpa.med.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Media / Negative Mode",
  type = "b"
  )

vpa.med.neg.full.pca.x <- as.data.frame(vpa.med.neg.full.pca$x)
row.names(vpa.med.neg.full.pca.x) <- vpa.med.neg.noNA$Samples
vpa.med.neg.full.pca.x <- vpa.med.neg.full.pca.x %>% 
  bind_cols(vpa.med.neg.noNA %>% select(Group:Experiment))
vpa.med.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group, shape = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (75.8% Var)") +
  ylab("PC2 (8.4%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Media / Negative Mode")

### Samples and QC ###
vpa.med.neg.smpl.QC.pca <- vpa.med.neg.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.med.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.med.neg.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Media / Negative Mode",
  type = "b"
  )

vpa.med.neg.smpl.QC.pca.x <- as.data.frame(vpa.med.neg.smpl.QC.pca$x)
vpa.med.neg.smpl.QC.pca.x <- vpa.med.neg.smpl.QC.pca.x %>% 
  bind_cols(
    vpa.med.neg.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.med.neg.smpl.QC.pca.x) <- vpa.med.neg.smpl.QC.pca.x$Samples
vpa.med.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (56.8% Var)") +
  ylab("PC2 (25.0%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Negative Mode")

vpa.med.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (9.0% Var)") +
  ylab("PC4 (4.1%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Negative Mode")

### Experimental Samples Only ###
vpa.med.neg.smpl.pca <- vpa.med.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.med.neg.smpl.pca$sdev ^ 2) * 100 / sum(vpa.med.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Media / Negative Mode",
  type = "b"
  )

vpa.med.neg.smpl.pca.x <- as.data.frame(vpa.med.neg.smpl.pca$x)
vpa.med.neg.smpl.pca.x <- vpa.med.neg.smpl.pca.x %>% 
  bind_cols(
    vpa.med.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.med.neg.smpl.pca.x) <- vpa.med.neg.smpl.pca.x$Samples
vpa.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (58.2% Var)") +
  ylab("PC2 (27.5%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Media / Negative Mode")

vpa.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (8.3% Var)") +
  ylab("PC4 (2.4%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Media / Negative Mode")

3.3.4 Media / Positive Mode

### PCA on all Samples ###
vpa.med.pos.full.pca <- vpa.med.pos.noNA %>% 
  select(starts_with("VPApM")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (vpa.med.pos.full.pca$sdev ^ 2) * 100 / sum(vpa.med.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Media / Positive Mode",
  type = "b"
  )

vpa.med.pos.full.pca.x <- as.data.frame(vpa.med.pos.full.pca$x)
row.names(vpa.med.pos.full.pca.x) <- vpa.med.pos.noNA$Samples
vpa.med.pos.full.pca.x <- vpa.med.pos.full.pca.x %>% 
  bind_cols(vpa.med.pos.noNA %>% select(Group:Experiment))
vpa.med.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group, shape = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (85.6% Var)") +
  ylab("PC2 (6.3%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Media / Positive Mode")

### Samples and QC ###
vpa.med.pos.smpl.QC.pca <- vpa.med.pos.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.med.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.med.pos.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Media / Positive Mode",
  type = "b"
  )

vpa.med.pos.smpl.QC.pca.x <- as.data.frame(vpa.med.pos.smpl.QC.pca$x)
vpa.med.pos.smpl.QC.pca.x <- vpa.med.pos.smpl.QC.pca.x %>% 
  bind_cols(
    vpa.med.pos.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.med.pos.smpl.QC.pca.x) <- vpa.med.pos.smpl.QC.pca.x$Samples
vpa.med.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (72.1% Var)") +
  ylab("PC2 (15.6%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Positive Mode")

vpa.med.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (6.0% Var)") +
  ylab("PC4 (2.5%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Positive Mode")

### Experimental Samples Only ###
vpa.med.pos.smpl.pca <- vpa.med.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.med.pos.smpl.pca$sdev ^ 2) * 100 / sum(vpa.med.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Media / Positive Mode",
  type = "b"
  )

vpa.med.pos.smpl.pca.x <- as.data.frame(vpa.med.pos.smpl.pca$x)
vpa.med.pos.smpl.pca.x <- vpa.med.pos.smpl.pca.x %>% 
  bind_cols(
    vpa.med.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.med.pos.smpl.pca.x) <- vpa.med.pos.smpl.pca.x$Samples
vpa.med.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (80.9% Var)") +
  ylab("PC2 (11.2%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Media / Positive Mode")

4 Batch Effects and Signifiance Testing

4.1 VPA metabolism

Is VPA metabolised by cells?

vpa.med.neg.noNA %>% 
  select(Samples:Experiment, VPAnM24) %>% 
  unite(Sample_Type, Group:Treatment, sep = "_") %>% 
  ggplot(aes(Sample_Type, VPAnM24)) +
  geom_jitter(alpha = 0.8, width = 0.1)

vpa.med.neg.noNA %>% 
  select(Samples:Experiment, VPAnM24) %>% 
  unite(Sample_Type, Group:Treatment, sep = "_") %>% 
  ggplot(aes(Sample_Type, VPAnM24)) +
  geom_boxplot()

vpa.med.neg.noNA %>% 
  select(Samples:Experiment, VPAnM24) %>% 
  filter(Treatment == "vpa") %>% 
  group_by(Group) %>% 
  summarize(
    vpa.avg = mean(VPAnM24),
    vpa.sd = sd(VPAnM24)
    )
# A tibble: 2 x 3
  Group  vpa.avg vpa.sd
  <chr>    <dbl>  <dbl>
1 empty     20.8  0.413
2 sample    20.8  0.506

It seems likely that VPA is not getting metabolised to a great extent, but cannot be sure.

4.2 Cells / Negative Mode

# sample prep
vpa.cell.neg.smpl.data <- vpa.cell.neg.noNA %>% 
    filter(Group == "sample")
vpa.cell.neg.data.for.sva <- as.matrix(
  vpa.cell.neg.smpl.data[, which(
    colnames(vpa.cell.neg.smpl.data) == "VPAnC1"
    ):ncol(vpa.cell.neg.smpl.data)]
  )
row.names(vpa.cell.neg.data.for.sva) <- vpa.cell.neg.smpl.data$Samples
vpa.cell.neg.data.for.sva <- t(vpa.cell.neg.data.for.sva)
# pheno prep
vpa.cell.neg.data.pheno <- as.data.frame(vpa.cell.neg.smpl.data[, 5:7])
row.names(vpa.cell.neg.data.pheno) <- vpa.cell.neg.smpl.data$Samples
# sva calculation
vpa.cell.neg.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.cell.neg.data.pheno)
vpa.cell.neg.mod0 <- model.matrix(~ 1, data = vpa.cell.neg.data.pheno)
set.seed(2018)
num.sv(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, method = "be")
[1] 3
set.seed(2018)
num.sv(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.cell.neg.sv <- sva(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, vpa.cell.neg.mod0)
Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  
# extract the surrogate variables
vpa.cell.neg.surr.var <- as.data.frame(vpa.cell.neg.sv$sv)
colnames(vpa.cell.neg.surr.var) <- c("S1", "S2", "S3")
colnames(vpa.cell.neg.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
# combine the full model matrix and the surrogate variables into one
vpa.cell.neg.d.sv <- cbind(vpa.cell.neg.mod.vpa, vpa.cell.neg.surr.var)  
head(vpa.cell.neg.d.sv)
         cntrl DRUGvsCNTRL          S1          S2          S3
T1_P1_C1     1           0 -0.02572670  0.30588286 -0.24101509
T1_P1_C2     1           0  0.22693416 -0.04248658 -0.16668619
T1_P1_C3     1           0  0.24683594  0.01195802 -0.16215771
T1_P1_V1     1           1  0.26304093  0.05597521  0.10204063
T1_P1_V2     1           1 -0.08024336  0.19758531  0.01981105
T1_P1_V3     1           1 -0.08965598  0.26184963 -0.00716443
vpa.cell.neg.top.table <- vpa.cell.neg.data.for.sva %>% 
  # fit a linear model 
  lmFit(vpa.cell.neg.d.sv) %>% 
  # calculate the test statistics
  eBayes() %>% 
  # select the top features that have a p-value < 0.05 after Bonferroni multiple hypothesis correction
  topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.cell.neg.data.for.sva))
# what the result looks like:
head(vpa.cell.neg.top.table)  
              logFC  AveExpr          t      P.Value    adj.P.Val
VPAnC119 -1.1323186 16.34932 -16.409667 3.271285e-13 5.855600e-11
VPAnC152 -1.3534317 16.93098 -11.248220 3.410586e-10 6.104950e-08
VPAnC114  0.9294965 15.08073  10.511594 1.116320e-09 1.998213e-07
VPAnC148 -0.7049114 14.87538  -8.903691 1.840005e-08 3.293608e-06
VPAnC18  -0.6384362 21.20349  -8.657914 2.903710e-08 5.197640e-06
VPAnC97  -0.3901582 19.32692  -8.136154 7.849318e-08 1.405028e-05
                 B
VPAnC119 20.447355
VPAnC152 13.521747
VPAnC114 12.324752
VPAnC148  9.486352
VPAnC18   9.023421
VPAnC97   8.013953
# make result more informative
vpa.cell.neg.top.w.info <- vpa.cell.neg.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    vpa_div_cntrl = 2 ^ logFC,
    change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(vpa.cell.neg.compound.info, by = "compound_short") %>% 
  filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%  
  arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.cell.neg.top.w.info %>% 
  select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
   compound_short                         compound_full change_in_vpa
1         VPAnC33                          4-Oxoproline          down
2         VPAnC58                        N-Acetylserine          down
3         VPAnC65                              Xanthine          down
4         VPAnC97                          myo-Inositol          down
5         VPAnC41                            Asparagine          down
6         VPAnC23                           Thiosulfate          down
7         VPAnC46                           Deoxyribose          down
8        VPAnC135              Glycerol 1-phosphoserine          down
9         VPAnC18                         Glyceric Acid          down
10        VPAnC86                                   G3P          down
11       VPAnC136                 D-Glucose 6-phosphate          down
12       VPAnC148             Sedoheptulose 7-phosphate          down
13       VPAnC172                      Succinoadenosine          down
14       VPAnC144                            Xanthosine          down
15       VPAnC119        Glycerylphosphorylethanolamine          down
16        VPAnC10                           Lactic Acid          down
17        VPAnC48                               Adenine          down
18       VPAnC176                                  dTDP          down
19       VPAnC127                    Ribose 5-Phosphate          down
20       VPAnC152                             GlcNAc-1P          down
21       VPAnC159                                   CMP          down
22       VPAnC160                                   UMP          down
23       VPAnC138                               Inosine          down
24       VPAnC168                                   AMP          down
25       VPAnC169                                   GMP          down
26       VPAnC114                       Homocitric Acid            up
27        VPAnC54                         Valproic acid            up
28        VPAnC53                 O-Phosphoethanolamine            up
29       VPAnC200                           ADP-Glucose            up
30       VPAnC157 11Z,14Z-Eicosadienoic Acid (20:2 n-6)            up
31       VPAnC189                                   CTP            up
32        VPAnC67                      3-Sulfinoalanine            up
33        VPAnC44                         Aspartic Acid            up
34        VPAnC73                    2-Aminoadipic Acid            up
35       VPAnC124                         Cystathionine            up
36        VPAnC17                                Serine            up
37        VPAnC30                             Threonine            up
38         VPAnC2                               Glycine            up
   vpa_div_cntrl
1      0.8053737
2      0.7968679
3      0.7784193
4      0.7630459
5      0.7359651
6      0.7195058
7      0.7080025
8      0.6729909
9      0.6424089
10     0.6384422
11     0.6149253
12     0.6134802
13     0.5099818
14     0.4621206
15     0.4561820
16     0.4510010
17     0.4284833
18     0.4207822
19     0.4146314
20     0.3913600
21     0.3771595
22     0.3242992
23     0.2866219
24     0.1530079
25     0.1220719
26     1.9046112
27     1.8136293
28     1.5898490
29     1.5724338
30     1.5286555
31     1.4873520
32     1.4829863
33     1.4483811
34     1.4198508
35     1.4098145
36     1.2661473
37     1.2384215
38     1.2253333
#write_csv(path = "./results/vpa_cell_neg_top_hits_w_FC_pval.csv", vpa.cell.neg.top.w.info)
vpa.cell.neg.gathered <- vpa.cell.neg.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vpa.cell.neg.surr.var) %>% 
  select(Samples, Treatment, S1:S3, starts_with("VPAnC")) %>% 
  gather(key = "Compound", value = "Abundance", VPAnC1:VPAnC99)
# structure so far:
glimpse(vpa.cell.neg.gathered)
Observations: 4,117
Variables: 7
$ Samples   <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_V1", "T1_...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "vpa", "vpa", "vpa", "cnt...
$ S1        <dbl> -0.02572670, 0.22693416, 0.24683594, 0.26304093, -0....
$ S2        <dbl> 0.30588286, -0.04248658, 0.01195802, 0.05597521, 0.1...
$ S3        <dbl> -0.2410150862, -0.1666861888, -0.1621577120, 0.10204...
$ Compound  <chr> "VPAnC1", "VPAnC1", "VPAnC1", "VPAnC1", "VPAnC1", "V...
$ Abundance <dbl> 15.08795, 14.67335, 14.81717, 14.55139, 14.79476, 15...
vpa.cell.neg.nested <- vpa.cell.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  # apply a linear model to each individual compound, as a function of the surrogate variables
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2 + S3, data = .))) %>% 
  # use broom to tidy up the output
  mutate(augment_model = map(model, augment))
# result to far:
vpa.cell.neg.nested
# A tibble: 179 x 4
   Compound data              model    augment_model     
   <chr>    <list>            <list>   <list>            
 1 VPAnC1   <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
 2 VPAnC10  <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
 3 VPAnC100 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
 4 VPAnC101 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
 5 VPAnC102 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
 6 VPAnC103 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
 7 VPAnC104 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
 8 VPAnC105 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
 9 VPAnC106 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
10 VPAnC107 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
# ... with 169 more rows
# now to get the residuals out for each compound
vpa.cell.neg.modSV.resid <- vpa.cell.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  # return to long format
  spread(Compound, .resid) 
glimpse(vpa.cell.neg.modSV.resid[, 1:5])
Observations: 23
Variables: 5
$ Samples   <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_V1", "T1_...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "vpa", "vpa", "vpa", "cnt...
$ VPAnC1    <dbl> 0.10319936, -0.06371124, 0.04982032, -0.24595544, -0...
$ VPAnC10   <dbl> 0.29867785, 0.67292980, 0.19366552, -1.03548312, -0....
$ VPAnC100  <dbl> 0.098012517, -0.085761072, 0.123939301, 0.084645145,...
vpa.cell.neg.modSV.resid %>% 
  select(Samples, one_of(vpa.cell.neg.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("VPAnC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA-Only Experiment / Cells / Negative Mode",
    margins = c(50, 50, 75, 30),
    labRow = vpa.cell.neg.top.w.info$compound_full,
    k_col = 2, k_row = 2
    )
### PCA - cleaned data ###
vpa.cell.neg.modSV.pca <- vpa.cell.neg.modSV.resid %>% 
  select(starts_with("VPAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vpa.cell.neg.modSV.pca.x <- as.data.frame(vpa.cell.neg.modSV.pca$x)
row.names(vpa.cell.neg.modSV.pca.x) <- vpa.cell.neg.modSV.resid$Samples
vpa.cell.neg.modSV.pca.x <- vpa.cell.neg.modSV.pca.x %>% 
  bind_cols(vpa.cell.neg.modSV.resid %>% select(Treatment))
vpa.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (60.5% Var)") +
  ylab("PC2 (10.6% Var)")

vpa.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (7.6% Var)") +
  ylab("PC4 (6.8% Var)")

4.3 Cells / Positive Mode

vpa.cell.pos.smpl.data <- vpa.cell.pos.noNA %>% 
    filter(Group == "sample")
vpa.cell.pos.data.for.sva <- as.matrix(
  vpa.cell.pos.smpl.data[, which(
    colnames(vpa.cell.pos.smpl.data) == "VPApC1"
    ):ncol(vpa.cell.pos.smpl.data)]
  )
row.names(vpa.cell.pos.data.for.sva) <- vpa.cell.pos.smpl.data$Samples
vpa.cell.pos.data.for.sva <- t(vpa.cell.pos.data.for.sva)
vpa.cell.pos.data.pheno <- as.data.frame(vpa.cell.pos.smpl.data[, 5:7])
row.names(vpa.cell.pos.data.pheno) <- vpa.cell.pos.smpl.data$Samples
# sva calculation
vpa.cell.pos.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.cell.pos.data.pheno)
vpa.cell.pos.mod0 <- model.matrix(~ 1, data = vpa.cell.pos.data.pheno)
set.seed(2018)
num.sv(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, method = "be")
[1] 2
set.seed(2018)
num.sv(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, method = "leek")
[1] 2
set.seed(2018)
vpa.cell.pos.sv <- sva(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, vpa.cell.pos.mod0)
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
vpa.cell.pos.surr.var <- as.data.frame(vpa.cell.pos.sv$sv)
colnames(vpa.cell.pos.surr.var) <- c("S1", "S2")
colnames(vpa.cell.pos.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
vpa.cell.pos.d.sv <- cbind(vpa.cell.pos.mod.vpa, vpa.cell.pos.surr.var)  
head(vpa.cell.pos.d.sv)
         cntrl DRUGvsCNTRL         S1          S2
T1_P1_C1     1           0 -0.3187310  0.19917816
T1_P1_C2     1           0 -0.2279608 -0.06387899
T1_P1_C3     1           0 -0.1409242  0.01021199
T1_P1_V1     1           1 -0.2796968 -0.36213773
T1_P1_V2     1           1 -0.1999223 -0.13213535
T1_P1_V3     1           1 -0.2955297  0.16841744
vpa.cell.pos.top.table <- vpa.cell.pos.data.for.sva %>% 
  lmFit(vpa.cell.pos.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.cell.pos.data.for.sva))
vpa.cell.pos.top.w.info <- vpa.cell.pos.top.table %>%
  rownames_to_column("compound_short") %>% 
  mutate(
    vpa_div_cntrl = 2 ^ logFC,
    change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(vpa.cell.pos.compound.info, by = "compound_short") %>% 
  filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%  
  arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.cell.pos.top.w.info %>% 
  select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
   compound_short                  compound_full change_in_vpa
1         VPApC35                     Asparagine          down
2         VPApC40             N-Methylnicotinate          down
3         VPApC51                       Xanthine          down
4        VPApC101       Glycerol 1-phosphoserine          down
5         VPApC24                   Nicotinamide          down
6         VPApC49                       D-Ribose          down
7         VPApC89                         GlcNAc          down
8         VPApC67                            G3P          down
9        VPApC112                     Xanthosine          down
10        VPApC38                        Adenine          down
11        VPApC39                   Hypoxanthine          down
12       VPApC119                            CMP          down
13        VPApC14                         Uracil          down
14       VPApC134                            ADP          down
15       VPApC107                        Inosine          down
16        VPApC85 Glycerylphosphorylethanolamine          down
17        VPApC60                         Fucose          down
18        VPApC95                        Uridine          down
19        VPApC74                   D-Galactitol          down
20       VPApC120                            UMP          down
21       VPApC121                            NMN          down
22       VPApC100      Glycerol-3-phosphocholine          down
23        VPApC17                        Proline            up
24        VPApC37                  Aspartic Acid            up
25        VPApC52               3-Sulfinoalanine            up
26       VPApC172                       PC(36:4)            up
27       VPApC159                    ADP-Glucose            up
28        VPApC41          O-Phosphoethanolamine            up
29        VPApC56                Methyl-L-Lysine            up
30        VPApC90                  Cystathionine            up
31       VPApC105                       Thiamine            up
32       VPApC171                       PC(35:2)            up
   vpa_div_cntrl
1      0.8020385
2      0.7628304
3      0.7304178
4      0.6439406
5      0.6390530
6      0.6253119
7      0.6142356
8      0.6080481
9      0.5521731
10     0.4588743
11     0.4538572
12     0.4491053
13     0.4414234
14     0.4391555
15     0.4284642
16     0.4242119
17     0.4078264
18     0.4012096
19     0.3819708
20     0.3793175
21     0.3466167
22     0.1517526
23     1.8933351
24     1.5849782
25     1.5334580
26     1.4855587
27     1.4155297
28     1.3495668
29     1.3212269
30     1.2885595
31     1.2460124
32     1.2131600
#write_csv(path = "./results/vpa_cell_pos_top_hits_w_FC_pval.csv", vpa.cell.pos.top.w.info)
vpa.cell.pos.gathered <- vpa.cell.pos.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vpa.cell.pos.surr.var) %>% 
  select(Samples, Treatment, S1:S2, starts_with("VPApC")) %>% 
  gather(key = "Compound", value = "Abundance", VPApC1:VPApC98)
vpa.cell.pos.nested <- vpa.cell.pos.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vpa.cell.pos.modSV.resid <- vpa.cell.pos.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  spread(Compound, .resid) 
vpa.cell.pos.modSV.resid %>% 
  select(Samples, one_of(vpa.cell.pos.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("VPApC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA-Only Experiment / Cells / Positive Mode",
    margins = c(50, 50, 75, 30),
    labRow = vpa.cell.pos.top.w.info$compound_full,
    k_row = 2, k_col = 2
    )
### PCA - cleaned data ###
vpa.cell.pos.modSV.pca <- vpa.cell.pos.modSV.resid %>% 
  select(starts_with("VPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vpa.cell.pos.modSV.pca.x <- as.data.frame(vpa.cell.pos.modSV.pca$x)
row.names(vpa.cell.pos.modSV.pca.x) <- vpa.cell.pos.modSV.resid$Samples
vpa.cell.pos.modSV.pca.x <- vpa.cell.pos.modSV.pca.x %>% 
  bind_cols(vpa.cell.pos.modSV.resid %>% select(Treatment))
vpa.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (65.4% Var)") +
  ylab("PC2 (12.4% Var)")

vpa.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (6.5% Var)") +
  ylab("PC4 (4.5% Var)")

4.4 Media / Negative Mode

vpa.med.neg.smpl.data <- vpa.med.neg.noNA %>% 
    filter(Group == "sample")
vpa.med.neg.data.for.sva <- as.matrix(
  vpa.med.neg.smpl.data[, which(
    colnames(vpa.med.neg.smpl.data) == "VPAnM1"
    ):ncol(vpa.med.neg.smpl.data)]
  )
row.names(vpa.med.neg.data.for.sva) <- vpa.med.neg.smpl.data$Samples
vpa.med.neg.data.for.sva <- t(vpa.med.neg.data.for.sva)
vpa.med.neg.data.pheno <- as.data.frame(vpa.med.neg.smpl.data[, 5:7])
row.names(vpa.med.neg.data.pheno) <- vpa.med.neg.smpl.data$Samples
vpa.med.neg.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.med.neg.data.pheno)
vpa.med.neg.mod0 <- model.matrix(~ 1, data = vpa.med.neg.data.pheno)
set.seed(2018)
num.sv(vpa.med.neg.data.for.sva, vpa.med.neg.mod.vpa, method = "be")
[1] 1
set.seed(2018)
num.sv(vpa.med.neg.data.for.sva, vpa.med.neg.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.med.neg.sv <- sva(vpa.med.neg.data.for.sva, vpa.med.neg.mod.vpa, vpa.med.neg.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
vpa.med.neg.surr.var <- as.data.frame(vpa.med.neg.sv$sv)
colnames(vpa.med.neg.surr.var) <- c("S1")
colnames(vpa.med.neg.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
vpa.med.neg.d.sv <- cbind(vpa.med.neg.mod.vpa, vpa.med.neg.surr.var)  
head(vpa.med.neg.d.sv)
         cntrl DRUGvsCNTRL          S1
T1_P1_C1     1           0 -0.11876454
T1_P1_C2     1           0 -0.16142883
T1_P1_C3     1           0 -0.36348674
T1_P1_V1     1           1 -0.07160605
T1_P1_V2     1           1  0.16394296
T1_P1_V3     1           1  0.23197510
vpa.med.neg.top.table <- vpa.med.neg.data.for.sva %>% 
  lmFit(vpa.med.neg.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.med.neg.data.for.sva))
vpa.med.neg.top.w.info <- vpa.med.neg.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    vpa_div_cntrl = 2 ^ logFC,
    change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(vpa.med.neg.compound.info, by = "compound_short") %>% 
  filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%  
  arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.med.neg.top.w.info %>% 
  select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
  compound_short    compound_full change_in_vpa vpa_div_cntrl
1        VPAnM49 Arachidonic Acid          down     0.7855066
2        VPAnM24    Valproic acid            up    32.7464129
vpa.med.neg.gathered <- vpa.med.neg.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vpa.med.neg.surr.var) %>% 
  select(Samples, Treatment, S1, starts_with("VPAnM")) %>% 
  gather(key = "Compound", value = "Abundance", VPAnM1:VPAnM9)
vpa.med.neg.nested <- vpa.med.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vpa.med.neg.modSV.resid <- vpa.med.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  spread(Compound, .resid) 
vpa.med.neg.modSV.resid %>% 
  select(Samples, one_of(vpa.med.neg.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("VPAnM") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA-Only Experiment / Media / Negative Mode",
    margins = c(50, 50, 75, 30),
    labRow = vpa.med.neg.top.w.info$compound_full,
    k_col = 2, k_row = 2
    )
### PCA - cleaned data ###
vpa.med.neg.modSV.pca <- vpa.med.neg.modSV.resid %>% 
  select(starts_with("VPAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vpa.med.neg.modSV.pca.x <- as.data.frame(vpa.med.neg.modSV.pca$x)
row.names(vpa.med.neg.modSV.pca.x) <- vpa.med.neg.modSV.resid$Samples
vpa.med.neg.modSV.pca.x <- vpa.med.neg.modSV.pca.x %>% 
  bind_cols(vpa.med.neg.modSV.resid %>% select(Treatment))
vpa.med.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (67.9% Var)") +
  ylab("PC2 (18.6% Var)")

vpa.med.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (5.4% Var)") +
  ylab("PC4 (3.4% Var)")

4.5 Media / Positive Mode

vpa.med.pos.smpl.data <- vpa.med.pos.noNA %>% 
    filter(Group == "sample")
vpa.med.pos.data.for.sva <- as.matrix(
  vpa.med.pos.smpl.data[, which(
    colnames(vpa.med.pos.smpl.data) == "VPApM1"
    ):ncol(vpa.med.pos.smpl.data)]
  )
row.names(vpa.med.pos.data.for.sva) <- vpa.med.pos.smpl.data$Samples
vpa.med.pos.data.for.sva <- t(vpa.med.pos.data.for.sva)
vpa.med.pos.data.pheno <- as.data.frame(vpa.med.pos.smpl.data[, 5:7])
row.names(vpa.med.pos.data.pheno) <- vpa.med.pos.smpl.data$Samples
vpa.med.pos.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.med.pos.data.pheno)
vpa.med.pos.mod0 <- model.matrix(~ 1, data = vpa.med.pos.data.pheno)
set.seed(2018)
num.sv(vpa.med.pos.data.for.sva, vpa.med.pos.mod.vpa, method = "be")
[1] 1
set.seed(2018)
num.sv(vpa.med.pos.data.for.sva, vpa.med.pos.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.med.pos.sv <- sva(vpa.med.pos.data.for.sva, vpa.med.pos.mod.vpa, vpa.med.pos.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
vpa.med.pos.surr.var <- as.data.frame(vpa.med.pos.sv$sv)
colnames(vpa.med.pos.surr.var) <- c("S1")
colnames(vpa.med.pos.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
vpa.med.pos.d.sv <- cbind(vpa.med.pos.mod.vpa, vpa.med.pos.surr.var)  
head(vpa.med.pos.d.sv)
         cntrl DRUGvsCNTRL          S1
T1_P1_C1     1           0  0.05493060
T1_P1_C2     1           0  0.19001851
T1_P1_C3     1           0  0.35675663
T1_P1_V1     1           1  0.02929437
T1_P1_V2     1           1 -0.20959541
T1_P1_V3     1           1 -0.24263772
vpa.med.pos.top.table <- vpa.med.pos.data.for.sva %>% 
  lmFit(vpa.med.pos.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.med.pos.data.for.sva))
vpa.med.pos.top.table
data frame with 0 columns and 0 rows

5 Pathway analysis

5.1 KEGG attach

head(vpa.cell.neg.top.w.info)
  compound_short      logFC  AveExpr         t      P.Value    adj.P.Val
1        VPAnC33 -0.3122697 23.94261 -4.541768 1.904961e-04 3.409880e-02
2        VPAnC58 -0.3275876 17.48468 -4.542368 1.902279e-04 3.405079e-02
3        VPAnC65 -0.3613807 20.56905 -4.542536 1.901525e-04 3.403730e-02
4        VPAnC97 -0.3901582 19.32692 -8.136154 7.849318e-08 1.405028e-05
5        VPAnC41 -0.4422907 18.18660 -7.076306 6.614607e-07 1.184015e-04
6        VPAnC23 -0.4749217 18.27583 -6.244306 3.915075e-06 7.007984e-04
          B vpa_div_cntrl change_in_vpa  compound_full     formula    mass
1 0.1367131     0.8053737          down   4-Oxoproline  C5 H7 N O3 129.043
2 0.1381199     0.7968679          down N-Acetylserine  C5 H9 N O4 147.054
3 0.1385153     0.7784193          down       Xanthine C5 H4 N4 O2 152.034
4 8.0139530     0.7630459          down   myo-Inositol   C6 H12 O6 180.064
5 5.8501353     0.7359651          down     Asparagine C4 H8 N2 O3 132.054
6 4.0475648     0.7195058          down    Thiosulfate    H2 O3 S2 113.945
    rt     cas_id
1 5.98  4347-18-6
2 3.38 16354-58-8
3 1.92    69-89-6
4 5.02    87-89-8
5 8.37    70-47-3
6 4.85 14383-50-7
head(vpa.cell.pos.top.w.info)
  compound_short      logFC  AveExpr         t      P.Value   adj.P.Val
1        VPApC35 -0.3182566 18.67222 -4.853699 7.687773e-05 0.010916637
2        VPApC40 -0.3905657 15.41180 -4.621143 1.350775e-04 0.019181004
3        VPApC51 -0.4532062 16.91997 -4.489349 1.860706e-04 0.026422023
4       VPApC101 -0.6350004 14.04687 -5.753222 8.991813e-06 0.001276837
5        VPApC24 -0.6459926 19.38535 -4.275739 3.128901e-04 0.044430397
6        VPApC49 -0.6773521 14.61980 -4.311187 2.870291e-04 0.040758128
            B vpa_div_cntrl change_in_vpa            compound_full
1  0.58407812     0.8020385          down               Asparagine
2  0.01844396     0.7628304          down       N-Methylnicotinate
3 -0.30211613     0.7304178          down                 Xanthine
4  2.75153845     0.6439406          down Glycerol 1-phosphoserine
5 -0.82081936     0.6390530          down             Nicotinamide
6 -0.73485577     0.6253119          down                 D-Ribose
        formula    mass    rt     cas_id
1   C4 H8 N2 O3 132.054  8.39    70-47-3
2    C7 H7 N O2 137.048 10.39   535-83-1
3   C5 H4 N4 O2 152.034  1.92    69-89-6
4 C6 H14 N O8 P 259.045  8.00 26289-09-8
5    C6 H6 N2 O 122.048  1.90    98-92-0
6     C5 H10 O5 150.052  2.45    50-69-1
head(vpa.med.neg.top.w.info)
  compound_short      logFC  AveExpr         t      P.Value    adj.P.Val
1        VPAnM49 -0.3483048 14.19924 -4.089121 4.528439e-04 1.856660e-02
2        VPAnM24  5.0332650 18.25088 50.453771 5.235543e-25 2.146573e-23
          B vpa_div_cntrl change_in_vpa    compound_full    formula
1 -2.342467     0.7855066          down Arachidonic Acid C20 H32 O2
2 47.332801    32.7464129            up    Valproic acid  C8 H16 O2
     mass   rt   cas_id
1 304.241 1.17 506-32-1
2 144.115 1.30  99-66-1
vpa.full.hit.list <- vpa.cell.neg.top.w.info %>% 
  mutate(Mode = "cell.neg") %>% 
  bind_rows(
    vpa.cell.pos.top.w.info %>% 
      mutate(Mode = "cell.pos")
  ) %>% 
  bind_rows(
    vpa.med.neg.top.w.info %>% 
      mutate(Mode = "med.neg")
  ) %>% 
  as.tibble()
vpa.sml.hit.list <- vpa.full.hit.list %>% 
  select(compound_full:formula, cas_id) %>% 
  distinct() %>% 
  arrange(compound_full) %>% 
  left_join(cmpnd.id.db, by = "cas_id")
all.equal(vpa.sml.hit.list$compound_full.x, vpa.sml.hit.list$compound_full.y)
[1] TRUE
#write_csv(path = "./results/vpa_only_exp_hits_w_kegg.csv", vpa.sml.hit.list)
glimpse(vpa.sml.hit.list)
Observations: 56
Variables: 7
$ compound_full.x <chr> "11Z,14Z-Eicosadienoic Acid (20:2 n-6)", "2-Am...
$ formula         <chr> "C20 H36 O2", "C6 H11 N O4", "C3 H7 N O4 S", "...
$ cas_id          <chr> "2091-39-6", "542-32-5", "1115-65-7", "4347-18...
$ compound_full.y <chr> "11Z,14Z-Eicosadienoic Acid (20:2 n-6)", "2-Am...
$ HMDB            <chr> "HMDB05060", "HMDB00510", "HMDB00996", "None",...
$ Metlin          <chr> "62964", "3271", "5927", "63471", "85", "34522...
$ KEGG            <chr> "C16525", "C00956", "C00606", "C01877", "C0014...

5.2 Nucleotide metabolism plots

nucleotide.metab <- read_csv("./data/pathways/vpa_only_exp_nucleotide_hits.csv")
### Purine Metabolism ###
purine.triphosphates <- c("ATP", "GTP", "dATP", "dGTP")
purine.for.plot <- nucleotide.metab %>% 
  inner_join(vpa.full.hit.list, by = "cas_id") %>% 
  filter(path1 == "Purine" | path2 == "Purine" | path3 == "Purine") %>% 
  select(compound_full.x, compound_short) %>% 
  rename(compound_full = compound_full.x) %>% 
  bind_rows(
    vpa.cell.neg.compound.info %>% 
      filter(compound_full %in% purine.triphosphates) %>% 
      select(compound_full, compound_short)
  ) %>% 
  bind_rows(
    vpa.cell.pos.compound.info %>% 
      filter(compound_full %in% purine.triphosphates) %>% 
      select(compound_full, compound_short)
  )
#write_csv(path = "./data/pathways/purine_pathway.csv", purine.for.plot)  do not run
purine.plot.order <- read_csv("./data/pathways/purine_pathway.csv") %>% 
  mutate(plot_order = factor(Name, levels = Name))
purine.data <- vpa.cell.neg.modSV.resid %>% 
  inner_join(vpa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>% 
  select(Samples:Treatment, one_of(purine.for.plot$compound_short)) %>% 
  mutate_at(vars(matches("VPA")), scale)
purine.sig.plot <- purine.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(
    purine.plot.order %>% 
      filter(compound_full != "Aspartic Acid" & compound_full != "Ribose 5-Phosphate"), 
    by = "compound_short"
    ) %>%
  filter(Sig == "Y") %>% 
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90")
    ) +
  xlab("") +
  ylab("") +
 # ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nStatistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.5, 2.5)
purine.sig.plot

purine.not.sig.plot <- purine.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(purine.plot.order, by = "compound_short") %>%
  filter(Sig == "N") %>% 
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90")
    ) +
  xlab("") +
  ylab("") +
  #ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nNot Statistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.5, 2.5)
purine.not.sig.plot

### Pyrimidine Metabolism ###
pyrimidine.triphosphates <- c("UTP", "CTP", "dTTP", "dCTP")
pyrimidine.for.plot <- nucleotide.metab %>% 
  inner_join(vpa.full.hit.list, by = "cas_id") %>% 
  filter(path1 == "Pyrimidine" | path2 == "Pyrimidine" | path3 == "Pyrimidine") %>% 
  select(compound_full.x, compound_short) %>% 
  rename(compound_full = compound_full.x) %>% 
  bind_rows(
    vpa.cell.neg.compound.info %>% 
      filter(compound_full %in% pyrimidine.triphosphates) %>% 
      select(compound_full, compound_short)
  ) %>% 
  bind_rows(
    vpa.cell.pos.compound.info %>% 
      filter(compound_full %in% pyrimidine.triphosphates) %>% 
      select(compound_full, compound_short)
  ) %>% 
  distinct()
#write_csv(path = "./data/pathways/pyrimidine_pathway.csv", pyrimidine.for.plot)
pyrimidine.plot.order <- read_csv("./data/pathways/pyrimidine_pathway.csv") %>% 
  mutate(plot_order = factor(Name, levels = Name))
pyrimidine.data <- vpa.cell.neg.modSV.resid %>% 
  inner_join(vpa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>% 
  select(Samples:Treatment, one_of(pyrimidine.for.plot$compound_short)) %>% 
  mutate_at(vars(matches("VPA")), scale)
pyrimidine.sig.plot <- pyrimidine.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(
    pyrimidine.plot.order %>% 
      filter(compound_full != "Ribose 5-Phosphate"), by = "compound_short") %>%
  filter(Sig == "Y") %>% 
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90")
    ) +
  xlab("") +
  ylab("") +
  #ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nStatistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.5, 2.5)
pyrimidine.sig.plot

pyrimidine.not.sig.plot <- pyrimidine.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(pyrimidine.plot.order, by = "compound_short") %>%
  filter(Sig == "N") %>% 
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90")
    ) +
  xlab("") +
  ylab("") +
  #ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nNot Statistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.5, 2.5)
pyrimidine.not.sig.plot

### Pentose Phosphate Pathway ###
ppp.for.plot <- nucleotide.metab %>% 
  inner_join(vpa.full.hit.list, by = "cas_id") %>% 
  filter(path1 == "PPP" | path2 == "PPP" | path3 == "PPP") %>% 
  select(compound_full.x, compound_short) %>% 
  rename(compound_full = compound_full.x)
#write_csv(path = "./data/pathways/ppp_pathway.csv", ppp.for.plot)
ppp.plot.order <- read_csv("./data/pathways/ppp_pathway.csv") %>% 
  mutate(plot_order = factor(Name, levels = Name))
ppp.data <- vpa.cell.neg.modSV.resid %>% 
  inner_join(vpa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>% 
  select(Samples:Treatment, one_of(ppp.for.plot$compound_short)) %>% 
  mutate_at(vars(matches("VPA")), scale)
ppp.sig.plot <- ppp.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(ppp.plot.order, by = "compound_short") %>%
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90"),
    legend.justification = "top"
    ) +
  xlab("") +
  ylab("") +
  #ggtitle("Normalized Abundance by Treatment Group\nppp Metabolism\nStatistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.75, 2.5) +
  scale_x_discrete(labels = c("Glucose\n6-Phosphate", "Ribose", "Deoxyribose", "Glyceraldehyde\n3-Phosphate (Neg)", "Glyceraldehyde\n3-Phosphate (Pos)", "Ribose\n5-Phosphate", "Sedoheptulose\n7-Phosphate"))
ppp.sig.plot

### all together
nuc.legend <- get_legend(ppp.sig.plot)
vpa.only.nuc.sig.plot.grid <- plot_grid(
  purine.sig.plot + theme(legend.position = "none", plot.margin = unit(c(6,6,0,0), "pt")), 
  pyrimidine.sig.plot + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")), 
  plot_grid(
    ppp.sig.plot + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")),
    nuc.legend,
    rel_widths = c(1.45, 0.55)
    ),
  ncol = 1, labels = c("A", "B", "C"),
  rel_heights = c(1, 1, 1)
  )
vpa.only.nuc.sig.plot.grid

#save_plot("./results/vpa_only_exp_nuc_sig.png", vpa.only.nuc.sig.plot.grid, base_height = 10, base_width = 8)
nuc.not.sig.row.plots <- plot_grid(
  purine.not.sig.plot, 
  pyrimidine.not.sig.plot, 
  labels = c("A", "B"), 
  ncol = 1
  )
nuc.not.sig.row.plots

#save_plot("./results/vpa_only_exp_nuc_not_sig.png", nuc.not.sig.row.plots, base_width = 8, base_height = 8)

6 Session Info

sessionInfo()
R version 3.5.1 (2018-07-02)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] bindrcpp_0.2.2      ggridges_0.5.1      broom_0.5.0        
 [4] limma_3.36.5        sva_3.28.0          BiocParallel_1.14.2
 [7] genefilter_1.62.0   mgcv_1.8-25         nlme_3.1-137       
[10] heatmaply_0.15.2    viridis_0.5.1       viridisLite_0.3.0  
[13] plotly_4.8.0        GGally_1.4.0        cowplot_0.9.3      
[16] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.8        
[19] purrr_0.2.5         readr_1.1.1         tidyr_0.8.2        
[22] tibble_1.4.2        ggplot2_3.1.0       tidyverse_1.2.1    

loaded via a namespace (and not attached):
  [1] colorspace_1.3-2     class_7.3-14         modeltools_0.2-22   
  [4] mclust_5.4.1         rprojroot_1.3-2      rstudioapi_0.8      
  [7] flexmix_2.3-14       bit64_0.9-7          fansi_0.4.0         
 [10] AnnotationDbi_1.42.1 mvtnorm_1.0-8        lubridate_1.7.4     
 [13] xml2_1.2.0           splines_3.5.1        codetools_0.2-15    
 [16] robustbase_0.93-3    knitr_1.20           jsonlite_1.5        
 [19] annotate_1.58.0      cluster_2.0.7-1      kernlab_0.9-27      
 [22] shiny_1.2.0          compiler_3.5.1       httr_1.3.1          
 [25] backports_1.1.2      assertthat_0.2.0     Matrix_1.2-15       
 [28] lazyeval_0.2.1       cli_1.0.1            later_0.7.5         
 [31] htmltools_0.3.6      tools_3.5.1          gtable_0.2.0        
 [34] glue_1.3.0           reshape2_1.4.3       Rcpp_1.0.0          
 [37] Biobase_2.40.0       cellranger_1.1.0     trimcluster_0.1-2.1 
 [40] gdata_2.18.0         crosstalk_1.0.0      iterators_1.0.10    
 [43] fpc_2.1-11.1         rvest_0.3.2          mime_0.6            
 [46] gtools_3.8.1         XML_3.98-1.16        dendextend_1.9.0    
 [49] DEoptimR_1.0-8       MASS_7.3-51.1        scales_1.0.0        
 [52] TSP_1.1-6            promises_1.0.1       hms_0.4.2           
 [55] parallel_3.5.1       RColorBrewer_1.1-2   yaml_2.2.0          
 [58] memoise_1.1.0        gridExtra_2.3        reshape_0.8.8       
 [61] stringi_1.2.4        RSQLite_2.1.1        gclus_1.3.1         
 [64] S4Vectors_0.18.3     foreach_1.4.4        seriation_1.2-3     
 [67] caTools_1.17.1.1     BiocGenerics_0.26.0  matrixStats_0.54.0  
 [70] rlang_0.3.0.1        pkgconfig_2.0.2      prabclus_2.2-6      
 [73] bitops_1.0-6         evaluate_0.12        lattice_0.20-38     
 [76] bindr_0.1.1          labeling_0.3         htmlwidgets_1.3     
 [79] bit_1.1-14           tidyselect_0.2.5     plyr_1.8.4          
 [82] magrittr_1.5         R6_2.3.0             IRanges_2.14.12     
 [85] gplots_3.0.1         DBI_1.0.0            pillar_1.3.0        
 [88] haven_1.1.2          whisker_0.3-2        withr_2.1.2         
 [91] survival_2.43-1      RCurl_1.95-4.11      nnet_7.3-12         
 [94] modelr_0.1.2         crayon_1.3.4         utf8_1.1.4          
 [97] KernSmooth_2.23-15   rmarkdown_1.10       grid_3.5.1          
[100] readxl_1.1.0         data.table_1.11.8    blob_1.1.1          
[103] digest_0.6.18        diptest_0.75-7       webshot_0.5.1       
[106] xtable_1.8-3         httpuv_1.4.5         stats4_3.5.1        
[109] munsell_0.5.0        registry_0.5